Package installation

We will use the following packages for this example:

# One-time package installation, guarded so it never runs when the file is
# sourced or knitted. Use the literal FALSE, not F: F is an ordinary variable
# that can be reassigned (e.g. F <- TRUE), which would silently trigger
# installation.
if (FALSE) {
  install.packages(c("devtools", "earth", "ggplot2", "mgcv", "mlbench", "plotmo"))
  devtools::install_github("ck37/ck37r")
}

library(ck37r)
library(devtools)
library(ggplot2)
library(mgcv)
## Loading required package: nlme
## This is mgcv 1.8-17. For overview type 'help("mgcv-package")'.
library(mlbench)
library(plotmo)
## Loading required package: plotrix
## Loading required package: TeachingDemos
library(earth)

Goals

Using the “PimaIndiansDiabetes2” dataset, construct splines, generalized additive models (GAMs), and multivariate adaptive regression splines (MARS) to predict blood pressure via the other variables as predictors. Missing data will be median-imputed and indicators will be created to document their missingness.

Preprocess the data

# load the Pima Indians diabetes dataset shipped with the mlbench package
data(PimaIndiansDiabetes2)
?PimaIndiansDiabetes2 # open the help page describing each variable
data <- PimaIndiansDiabetes2 # give the data a simpler name (this also shadows the name of base R's data() utility)
str(data) # inspect structure: 768 rows, 9 variables, several with NAs
## 'data.frame':    768 obs. of  9 variables:
##  $ pregnant: num  6 1 8 1 0 5 3 10 2 8 ...
##  $ glucose : num  148 85 183 89 137 116 78 115 197 125 ...
##  $ pressure: num  72 66 64 66 40 74 50 NA 70 96 ...
##  $ triceps : num  35 29 NA 23 35 NA 32 NA 45 NA ...
##  $ insulin : num  NA NA NA 94 168 NA 88 NA 543 NA ...
##  $ mass    : num  33.6 26.6 23.3 28.1 43.1 25.6 31 35.3 30.5 NA ...
##  $ pedigree: num  0.627 0.351 0.672 0.167 2.288 ...
##  $ age     : num  50 31 32 21 33 30 26 29 53 54 ...
##  $ diabetes: Factor w/ 2 levels "neg","pos": 2 1 2 1 2 1 2 1 2 2 ...

Check for missing data:

# total count of NA cells across the whole data frame
sum(is.na(data))
## [1] 652
# fraction of all cells that are NA -- mean of the logical NA-mask
# gives the proportion directly (about 9%)
mean(is.na(data))
## [1] 0.0943287

Recode the “diabetes” vector to numeric type:

# recode the two-level factor to numeric: 1 = "pos", 0 = "neg"
data$diabetes <- as.numeric(data$diabetes == "pos")

Use Chris K’s handy median impute function to impute missing values:

# Median-impute missing values and append miss_* indicator columns recording
# which cells were originally NA. Use <- (not =) for top-level assignment,
# per standard R style.
result <- ck37r::impute_missing_values(data)

# overwrite "data" with the new imputed data frame
data <- result$data

Double check that missing values have been imputed:

# confirm imputation removed every NA value
sum(is.na(data))
## [1] 0
# confirm the miss_* missingness-indicator columns were appended (9 -> 14 cols)
str(data)
## 'data.frame':    768 obs. of  14 variables:
##  $ pregnant     : num  6 1 8 1 0 5 3 10 2 8 ...
##  $ glucose      : num  148 85 183 89 137 116 78 115 197 125 ...
##  $ pressure     : num  72 66 64 66 40 74 50 72 70 96 ...
##  $ triceps      : num  35 29 29 23 35 29 32 29 45 29 ...
##  $ insulin      : num  125 125 125 94 168 125 88 125 543 125 ...
##  $ mass         : num  33.6 26.6 23.3 28.1 43.1 25.6 31 35.3 30.5 32.3 ...
##  $ pedigree     : num  0.627 0.351 0.672 0.167 2.288 ...
##  $ age          : num  50 31 32 21 33 30 26 29 53 54 ...
##  $ diabetes     : num  1 0 1 0 1 0 1 0 1 1 ...
##  $ miss_glucose : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ miss_pressure: num  0 0 0 0 0 0 0 1 0 0 ...
##  $ miss_triceps : num  0 0 1 0 0 1 0 1 0 1 ...
##  $ miss_insulin : num  1 1 1 0 0 1 0 1 0 1 ...
##  $ miss_mass    : num  0 0 0 0 0 0 0 0 0 1 ...

Parse our data into our Y outcome variable and X predictor variables:

# define Y
pressure <- "pressure" # name of the outcome column, used for the X split below
Y <- data$pressure     # numeric outcome vector for modeling

# define X predictors: every column except the outcome
X <- data[, names(data) != pressure]

# double check that the "pressure" vector has been removed
str(X)
## 'data.frame':    768 obs. of  13 variables:
##  $ pregnant     : num  6 1 8 1 0 5 3 10 2 8 ...
##  $ glucose      : num  148 85 183 89 137 116 78 115 197 125 ...
##  $ triceps      : num  35 29 29 23 35 29 32 29 45 29 ...
##  $ insulin      : num  125 125 125 94 168 125 88 125 543 125 ...
##  $ mass         : num  33.6 26.6 23.3 28.1 43.1 25.6 31 35.3 30.5 32.3 ...
##  $ pedigree     : num  0.627 0.351 0.672 0.167 2.288 ...
##  $ age          : num  50 31 32 21 33 30 26 29 53 54 ...
##  $ diabetes     : num  1 0 1 0 1 0 1 0 1 1 ...
##  $ miss_glucose : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ miss_pressure: num  0 0 0 0 0 0 0 1 0 0 ...
##  $ miss_triceps : num  0 0 1 0 0 1 0 1 0 1 ...
##  $ miss_insulin : num  1 1 1 0 0 1 0 1 0 1 ...
##  $ miss_mass    : num  0 0 0 0 0 0 0 0 0 1 ...

Beyond linear models

So far, we have briefly covered linear models. However, these are potentially limited in their predictive power because they always assume linear relationships in the data. Last semester, we talked about improving linear regression models via penalized regression (LASSO and ridge) and this semester via step functions (dividing the data into equal regions and fitting a function within each region in “stepwise” fashion). However, these methods still make linear assumptions about the data, and least squares is still used to estimate the regression coefficients.

There exists a great variety of nonlinear regression methods, such as polynomial regression, step functions, regression splines, smoothing splines, local regression (LOESS), generalized additive models (GAMs), and multivariate adaptive regression splines (MARS).

Regression splines are more flexible extensions of polynomial and piecewise regressions because they fit a polynomial function to the data within each piecewise region - constraints at region borders allow the pieces to join smoothly. Smoothing splines are similar to this idea, but add a smoothing penalty to the minimization of the residual sum of squares. Local regression is yet another extension of these methods, except the fitted functions are allowed to overlap neighboring data regions.

Polynomial models provide a simple glimpse of how non-linear predictors are related to an outcome variable. For example:

# fit a 3rd-degree polynomial regression of pressure on glucose;
# poly() builds orthogonal polynomial terms by default (raw = FALSE),
# so the three coefficients are not the raw b1*x + b2*x^2 + b3*x^3 weights
poly1 <- lm(pressure ~ poly(glucose, 3), data=data)
summary(poly1) # the output still resembles OLS regression
## 
## Call:
## lm(formula = pressure ~ poly(glucose, 3), data = data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -44.722  -6.903  -0.403   6.856  52.638 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        72.3867     0.4245 170.522  < 2e-16 ***
## poly(glucose, 3)1  73.3469    11.7641   6.235 7.48e-10 ***
## poly(glucose, 3)2 -18.4078    11.7641  -1.565   0.1181    
## poly(glucose, 3)3 -27.9648    11.7641  -2.377   0.0177 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 11.76 on 764 degrees of freedom
## Multiple R-squared:  0.05792,    Adjusted R-squared:  0.05422 
## F-statistic: 15.66 on 3 and 764 DF,  p-value: 6.851e-10
plot(pressure~glucose, data=data) # draw the raw scatterplot first

points(data$glucose, fitted(poly1), col="blue", pch=20) # overlay fitted polynomial values as a point "curve"

# how does this compare to a linear model? 

# fit a simple linear model for comparison
lm1 <- lm(pressure ~ glucose, data=data) 

# view the coefficients for plotting
lm1$coefficients
## (Intercept)     glucose 
##   61.801531    0.087009
# add the linear best-fit line (intercept, slope) to the still-open plot
abline(lm1$coefficients[1], lm1$coefficients[2], col="red", lwd=2)

Although the linear and polynomial fits are similar, they are not identical and their fits change differently throughout the scatterplot. Thus, it is important to consider nonlinear relationships in your data even if linearity is assumed.

Furthermore, note that the polynomial function “curve” (the blue dots) is actually a series of points along fairly equally spaced intervals. To conceptualize “piecewise” regression methods, imagine that a straight line is connecting each of the blue dots in the polynomial “curve”. This would instead create many connected lines where each point is a “knot” spaced out into equally spaced bins so that each directional change is a “hinge” in the line. Thus, the line is continuous but not smooth. To truly fit a curve that estimates nonlinear predictor relationships, you might want to instead utilize a smoothing function of some sort (see below).

Regression splines

Splines further extend the idea of polynomial and piecewise regression approaches to predict our outcome variable when the relationships among the predictors are nonlinear. Simply put, a spline is a curve that connects two or more points. This approach can more appropriately approximate the relationships in a dataset because it better “touches” more data points due to its curved nature (as opposed to a straight line).

See Introduction to Statistical Learning by James, et al., 2013 - Chapter 7 for detailed explanations and coding examples.

Let’s fit a spline using our example from above:

# Fit a cubic smoothing spline of pressure on glucose.
# spar = NULL lets the smoothing level be chosen automatically, and
# cv = FALSE selects generalized cross-validation (GCV) for that choice.
smooth1 <- smooth.spline(y=data$pressure, x=data$glucose, 
                         spar=NULL,
                         cv=FALSE)
smooth1
## Call:
## smooth.spline(x = data$glucose, y = data$pressure, spar = NULL, 
##     cv = FALSE)
## 
## Smoothing Parameter  spar= 0.9495251  lambda= 0.09389132 (12 iterations)
## Equivalent Degrees of Freedom (Df): 3.946397
## Penalized Criterion: 13855.96
## GCV: 139.2193
# plot the fitted spline values
plot(smooth1, main="smooth1 cubic spline", xlab="glucose", ylab="pressure", col="blue") 

# or, to make it a line
lines(smooth1, col="green", lwd=4)

# draw a "loess" smoothed curve over the scatterplot.
# scatter.smooth() is called purely for its plotting side effect and
# returns NULL invisibly, so the previous unused assignment was dropped.
scatter.smooth(data$pressure ~ data$glucose, 
               lpars=list(col="blue", lwd=3, lty=1),
               main="loess spline")

The basic summary information provided by calling “smooth1”.

spar= is the smoothing parameter and if not specified defaults to NULL and instead uses the degrees of freedom to approximate the degree of smoothing. Notice that spar=0.949. What happens if you change it to 0 or to 1.5?

cv= allows us to specify ordinary or generalized cross-validation (GCV).

lambda is the smoothing (roughness) penalty for the spline — larger values of lambda produce a smoother fit. (Note: this is a different role than in elastic-net penalized regression, where the mixing parameter alpha = 0 gives ridge regression and alpha = 1 gives LASSO.)

“loess” (locally weighted scatterplot smoothing) is a basic non-parametric strategy for fitting a smoothed curve to a scatterplot. Check out William G. Jacoby’s fun paper to learn more. Jacoby WG. 2000. Loess: A nonparametric, graphical tool for depicting relationships between variables

Generalized additive models (GAMS)

So far, we have only looked at estimating an outcome variable using a single predictor. However, when considering multiple predictor variables, an extension of multiple linear regression should be used - generalized additive models.

Thus, GAMS provide smoothed, nonlinear relationships between each predictor and the outcome variable. Each relationship is computed and then summed.

First, let’s recombine our outcome and predictor variables back into the same data frame called df:

# bind the outcome (named Y) back onto the predictors; cbind() on a
# data frame returns a data frame, so df keeps all column types
df <- cbind(Y, X)
str(df)
## 'data.frame':    768 obs. of  14 variables:
##  $ Y            : num  72 66 64 66 40 74 50 72 70 96 ...
##  $ pregnant     : num  6 1 8 1 0 5 3 10 2 8 ...
##  $ glucose      : num  148 85 183 89 137 116 78 115 197 125 ...
##  $ triceps      : num  35 29 29 23 35 29 32 29 45 29 ...
##  $ insulin      : num  125 125 125 94 168 125 88 125 543 125 ...
##  $ mass         : num  33.6 26.6 23.3 28.1 43.1 25.6 31 35.3 30.5 32.3 ...
##  $ pedigree     : num  0.627 0.351 0.672 0.167 2.288 ...
##  $ age          : num  50 31 32 21 33 30 26 29 53 54 ...
##  $ diabetes     : num  1 0 1 0 1 0 1 0 1 1 ...
##  $ miss_glucose : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ miss_pressure: num  0 0 0 0 0 0 0 1 0 0 ...
##  $ miss_triceps : num  0 0 1 0 0 1 0 1 0 1 ...
##  $ miss_insulin : num  1 1 1 0 0 1 0 1 0 1 ...
##  $ miss_mass    : num  0 0 0 0 0 0 0 0 0 1 ...

Now, s() should precede each of our predictor variables to ensure that the model is being properly constructed using spline-based smoothing.

The fx logical parameter can be added inside each s predictor variable to specify a fixed (TRUE) or penalized regression spline (FALSE). bs can also be added to specify the type of smoothing (eg., tp for thin plate, cr for cubic, etc.). A variety of other parameters can be specified.

Fit the GAM:

# fit an additive Gaussian model with a penalized smooth s() for each
# continuous predictor. NOTE(review): pedigree, diabetes, and the miss_*
# indicator columns present in df are not included in this formula --
# confirm that omission is intentional.
gam1 <- gam(Y ~ s(pregnant) + s(glucose) + s(triceps) + s(insulin) + s(mass) + s(age), 
            family="gaussian",
            data=df)

# plot weighted partial residuals (dropping the specified term while including all others)
plot(gam1, pages=1, residuals=TRUE)

# seWithMean=TRUE shows each component smooth with confidence intervals that
# also include the uncertainty about the overall mean (not the same thing as
# residuals=FALSE, which only controls whether partial residuals are drawn)
plot(gam1, pages=1, seWithMean=TRUE)

# or, to view them one at a time:
plot(gam1, seWithMean=TRUE)

# convergence and basis-dimension (k) diagnostics
gam.check(gam1)

## 
## Method: GCV   Optimizer: magic
## Smoothing parameter selection converged after 16 iterations.
## The RMS GCV score gradient at convergence was 3.939816e-06 .
## The Hessian was positive definite.
## Model rank =  55 / 55 
## 
## Basis dimension (k) checking results. Low p-value (k-index<1) may
## indicate that k is too low, especially if edf is close to k'.
## 
##               k'  edf k-index p-value
## s(pregnant) 9.00 4.82    1.02    0.65
## s(glucose)  9.00 1.82    1.04    0.87
## s(triceps)  9.00 1.00    1.07    0.96
## s(insulin)  9.00 1.00    1.02    0.72
## s(mass)     9.00 2.39    1.04    0.92
## s(age)      9.00 2.04    0.99    0.36

explain plot outputs here

In gam.check summary output: k' is …

edf is …

k-index is …

p-value is …

3D GAM plots

The “plotmo” R package offers a great way to visualize regression splines in three dimensions:

plotmo(gam1, all2=TRUE) # show simplified seWithMean plots AND three-dimensional splines for all variable relationships
##  plotmo grid:    pregnant glucose triceps insulin mass age
##                         3     117      29     125 32.3  29

plotmo(gam1, all2=TRUE, pmethod = "partdep") # plot partial dependencies (takes a few minutes)
## calculating partdep for pregnant 
## calculating partdep for glucose 
## calculating partdep for triceps 
## calculating partdep for insulin 
## calculating partdep for mass 
## calculating partdep for age 
## calculating partdep for pregnant:glucose 01234567890
## calculating partdep for pregnant:triceps 01234567890
## calculating partdep for pregnant:insulin 01234567890
## calculating partdep for pregnant:mass 01234567890
## calculating partdep for pregnant:age 01234567890
## calculating partdep for glucose:triceps 01234567890
## calculating partdep for glucose:insulin 01234567890
## calculating partdep for glucose:mass 01234567890
## calculating partdep for glucose:age 01234567890
## calculating partdep for triceps:insulin 01234567890
## calculating partdep for triceps:mass 01234567890
## calculating partdep for triceps:age 01234567890
## calculating partdep for insulin:mass 01234567890
## calculating partdep for insulin:age 01234567890
## calculating partdep for mass:age 01234567890

plotmo(gam1, all2=TRUE, pmethod = "apartdep", # approximate, faster version of pmethod="partdep"
       caption = "What have I gotten myself in to...") 
## calculating apartdep for pregnant 
## calculating apartdep for glucose 
## calculating apartdep for triceps 
## calculating apartdep for insulin 
## calculating apartdep for mass 
## calculating apartdep for age 
## calculating apartdep for pregnant:glucose 01234567890
## calculating apartdep for pregnant:triceps 01234567890
## calculating apartdep for pregnant:insulin 01234567890
## calculating apartdep for pregnant:mass 01234567890
## calculating apartdep for pregnant:age 01234567890
## calculating apartdep for glucose:triceps 01234567890
## calculating apartdep for glucose:insulin 01234567890
## calculating apartdep for glucose:mass 01234567890
## calculating apartdep for glucose:age 01234567890
## calculating apartdep for triceps:insulin 01234567890
## calculating apartdep for triceps:mass 01234567890
## calculating apartdep for triceps:age 01234567890
## calculating apartdep for insulin:mass 01234567890
## calculating apartdep for insulin:age 01234567890
## calculating apartdep for mass:age 01234567890

# let's play around with a few more cosmetic parameters!
plotmo(gam1, all2=TRUE, pt.col = "green3") # draw the raw data points in green
##  plotmo grid:    pregnant glucose triceps insulin mass age
##                         3     117      29     125 32.3  29

plotmo(gam1, all2=TRUE, pt.col = "green3", smooth.col = "red") # also overlay a red smooth through the points
##  plotmo grid:    pregnant glucose triceps insulin mass age
##                         3     117      29     125 32.3  29

plotmo(gam1, all2=TRUE,  
       pt.col = "green3", 
       smooth.col = "red",
       grid.col="gray80") # add a light background grid
##  plotmo grid:    pregnant glucose triceps insulin mass age
##                         3     117      29     125 32.3  29

# return just some of the plots! 
plotmo(gam1, all2=TRUE, degree1 = c(1,2), degree2=0) # degree1=c(1,2) keeps only the first two main-effect plots; degree2=0 suppresses the interaction plots
##  plotmo grid:    pregnant glucose triceps insulin mass age
##                         3     117      29     125 32.3  29

plotmo(gam1, all2=TRUE, degree1 = 0, degree2 = 1, # return just the first two-predictor perspective plot
       caption = "this is called a 'perspective plot'")

# return the same surface as a contour plot
plotmo(gam1, all2=TRUE, degree1 = 0, degree2 = 1, type2 = "contour", 
       contour.col="royalblue",
       contour.labcex=1,
       main="this is called a 'contour plot'")

# return the same surface as a color image 
plotmo(gam1, all2=TRUE, degree1 = 0, degree2 = 1, type2 = "image", 
       image.col=heat.colors(12),
       main="this is a color image", 
       caption="this is getting pretty crazy!")

# etc... 

See Wood S. 2006. Generalized additive models: An introduction with R for expert explanations.

Also check out Stephen Millborrow’s excellent instructions on the “plotmo” R package

Multivariate adaptive regression splines (MARS)

Multivariate adaptive regression splines (MARS) are a technique developed by Jerome H. Friedman in 1991 and copyrighted by Salford Systems. Open source implementations are thusly referred to as “Earth”, hence the name of the “earth” R package we are using today.

MARS approaches use “surrogate features” (or, models of the models), usually versions of one or two predictors at a time. Each predictor is split at a cut point into two groups, and the outcome variable is modeled separately within each group. This creates a “piecewise linear model” where each new feature covers some portion of the data.

Group definitions are provided via linear regression models! Those with the smallest error are used. See Kuhn and Johnson, 2016:145 for more information.

# recombine our Y outcome ("pressure") and X predictors into a single data frame
mars_df <- cbind(Y, X)

# fit the MARS model: backward pruning, at most 20 terms retained after
# pruning (nprune), and 10-fold cross-validation for the CV statistics
mars1 <- earth(Y ~ ., data=mars_df, pmethod="backward", nprune=20, nfold=10)

# view summary output
summary(mars1)
## Call: earth(formula=Y~., data=mars_df, pmethod="backward", nprune=20,
##             nfold=10)
## 
##                   coefficients
## (Intercept)         103.146207
## miss_insulin          2.465922
## h(143-glucose)       -0.060947
## h(mass-24.2)          0.500866
## h(pedigree-0.231)   -35.095069
## h(pedigree-0.546)    18.218325
## h(1.4-pedigree)     -22.745165
## h(51-age)            -0.331124
## 
## Selected 8 of 20 terms, and 5 of 13 predictors
## Termination condition: Reached nk 27
## Importance: age, mass, miss_insulin, glucose, pedigree, ...
## Number of terms at each degree of interaction: 1 7 (additive model)
## GCV 117.2566  RSS 86569.7  GRSq 0.1997205  RSq 0.2286688  CVRSq 0.140132
## 
## Note: the cross-validation sd's below are standard deviations across folds
## 
## Cross validation:   nterms 8.40 sd 2.07    nvars 5.10 sd 0.57
## 
##      CVRSq    sd     MaxErr sd
##       0.14 0.128        -56 38
# view predictor importance (nsubsets, gcv, rss criteria)
evimp(mars1)
##                nsubsets   gcv    rss
## age                   7 100.0  100.0
## mass                  6  65.0   69.0
## miss_insulin          5  28.4   40.0
## glucose               4  21.0   33.2
## pedigree              1  15.7   19.7
## insulin-unused        1 -10.0    9.8
# compute in-sample predicted values
mars_pred <- predict(mars1)

# mean squared error of the in-sample predictions (lower is better)
(mse <- mean((mars_df$Y - mars_pred)^2))
## [1] 112.721
# write the model-selection plot to a PDF file
pdf("mars1 plot.pdf")
plot(mars1)
dev.off()
## quartz_off_screen 
##                 2
# also write the plotmo panels to a PDF file
pdf("mars2 plotmo.pdf")
plotmo(mars1)
##  plotmo grid:    pregnant glucose triceps insulin mass pedigree age
##                         3     117      29     125 32.3   0.3725  29
##  diabetes miss_glucose miss_pressure miss_triceps miss_insulin miss_mass
##         0            0             0            0            0         0
dev.off()
## quartz_off_screen 
##                 2
# 3d MARS plots!
plotmo(mars1)
##  plotmo grid:    pregnant glucose triceps insulin mass pedigree age
##                         3     117      29     125 32.3   0.3725  29
##  diabetes miss_glucose miss_pressure miss_triceps miss_insulin miss_mass
##         0            0             0            0            0         0

# the same syntactical rules apply here as for the GAM plots above
plotmo(mars1, all2=TRUE)
##  plotmo grid:    pregnant glucose triceps insulin mass pedigree age
##                         3     117      29     125 32.3   0.3725  29
##  diabetes miss_glucose miss_pressure miss_triceps miss_insulin miss_mass
##         0            0             0            0            0         0

plotmo(mars1, all2=TRUE, degree1=0, degree2=9,
       caption = "what is the influence of 'miss_insulin'?") 

explain plot outputs here

In summary(mars1):

The coefficients are …

What terms and predictors were selected?

What is a termination condition?

GCV

RSS

GRSq

RSq

CVRSq

CV sd’s are sd’s across folds?

In evimp(mars1):

nsubsets

gcv

rss

Acknowledgements

I express sincere gratitude to the authors of cited work in this file, in addition to: “earth” R package “mgcv” R package “plotmo” R package